{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:58:55.830793Z",
     "start_time": "2024-09-09T01:58:55.826549Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import nltk\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from umap import UMAP\n",
    "from bertopic import BERTopic\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from bertopic.vectorizers import ClassTfidfTransformer\n",
    "from bertopic.representation import KeyBERTInspired\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.corpus import wordnet\n",
    "from fuzzywuzzy import fuzz\n",
    "from fuzzywuzzy import process"
   ],
   "id": "716184e9a2059464",
   "outputs": [],
   "execution_count": 48
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:38:51.353458Z",
     "start_time": "2024-09-09T01:38:50.165426Z"
    }
   },
   "cell_type": "code",
   "source": [
    "nltk.download(\"wordnet\")\n",
    "nltk.download(\"omw-1.4\")"
   ],
   "id": "b6612e1d6f514a41",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     /Users/moritzblum/nltk_data...\n",
      "[nltk_data] Downloading package omw-1.4 to\n",
      "[nltk_data]     /Users/moritzblum/nltk_data...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 36
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-09-09T00:47:26.776460Z",
     "start_time": "2024-09-09T00:47:26.715376Z"
    }
   },
   "source": [
    "# Load the ACL proceedings CSVs for 2017-2023 and stack them into one dataframe.\n",
    "acl_files = [f'data/nlp/raw/{year}_ACL.csv' for year in range(2017, 2024)]\n",
    "\n",
    "yearly_frames = [pd.read_csv(csv_path) for csv_path in acl_files]\n",
    "acl_data = pd.concat(yearly_frames, ignore_index=True)"
   ],
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:47:40.281743Z",
     "start_time": "2024-09-09T00:47:40.274539Z"
    }
   },
   "cell_type": "code",
   "source": "acl_data.head()",
   "id": "15d02189811bdcdc",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                                 link  \\\n",
       "0  https://aclanthology.org/P17-1001/   \n",
       "1  https://aclanthology.org/P17-1002/   \n",
       "2  https://aclanthology.org/P17-1003/   \n",
       "3  https://aclanthology.org/P17-1004/   \n",
       "4  https://aclanthology.org/P17-1005/   \n",
       "\n",
       "                                               title  \\\n",
       "0  Adversarial Multi-task Learning for Text Class...   \n",
       "1  Neural End-to-End Learning for Computational A...   \n",
       "2  Neural Symbolic Machines: Learning Semantic Pa...   \n",
       "3  Neural Relation Extraction with Multi-lingual ...   \n",
       "4  Learning Structured Natural Language Represent...   \n",
       "\n",
       "                                            abstract  \n",
       "0  Neural network models have shown their promisi...  \n",
       "1  We investigate neural techniques for end-to-en...  \n",
       "2  Harnessing the statistical power of neural net...  \n",
       "3  Relation extraction has been widely used for f...  \n",
       "4  We introduce a neural semantic parser which is...  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link</th>\n",
       "      <th>title</th>\n",
       "      <th>abstract</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://aclanthology.org/P17-1001/</td>\n",
       "      <td>Adversarial Multi-task Learning for Text Class...</td>\n",
       "      <td>Neural network models have shown their promisi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://aclanthology.org/P17-1002/</td>\n",
       "      <td>Neural End-to-End Learning for Computational A...</td>\n",
       "      <td>We investigate neural techniques for end-to-en...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://aclanthology.org/P17-1003/</td>\n",
       "      <td>Neural Symbolic Machines: Learning Semantic Pa...</td>\n",
       "      <td>Harnessing the statistical power of neural net...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://aclanthology.org/P17-1004/</td>\n",
       "      <td>Neural Relation Extraction with Multi-lingual ...</td>\n",
       "      <td>Relation extraction has been widely used for f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://aclanthology.org/P17-1005/</td>\n",
       "      <td>Learning Structured Natural Language Represent...</td>\n",
       "      <td>We introduce a neural semantic parser which is...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:48:26.928937Z",
     "start_time": "2024-09-09T00:48:26.924701Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Drop missing abstracts before casting: astype(\"str\") would otherwise turn NaN\n",
    "# into the literal document \"nan\", which would then be embedded and fuzzy-matched.\n",
    "# The index is reset so the series stays positionally aligned after the drop.\n",
    "texts = acl_data[\"abstract\"].dropna().astype(\"str\").reset_index(drop=True)\n",
    "texts.shape"
   ],
   "id": "e89c8e3241afa188",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4605,)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:54:31.855344Z",
     "start_time": "2024-09-09T00:54:09.302149Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Assemble the BERTopic pipeline from explicitly configured components.\n",
    "umap_model = UMAP(\n",
    "    n_neighbors=20,\n",
    "    n_components=50,\n",
    "    metric=\"cosine\",\n",
    "    min_dist=0.0,\n",
    "    random_state=37,  # fixed seed for the dimensionality reduction\n",
    ")\n",
    "# Topic terms are drawn from 2- to 4-word n-grams, with English stop words removed.\n",
    "vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words=\"english\")\n",
    "ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)\n",
    "sentence_model = SentenceTransformer(\"sentence-transformers/all-mpnet-base-v2\")\n",
    "representation_model = KeyBERTInspired()\n",
    "\n",
    "topic_model = BERTopic(\n",
    "    embedding_model=sentence_model,\n",
    "    umap_model=umap_model,\n",
    "    vectorizer_model=vectorizer_model,\n",
    "    ctfidf_model=ctfidf_model,\n",
    "    representation_model=representation_model,\n",
    "    nr_topics=50,\n",
    "    low_memory=True,\n",
    "    calculate_probabilities=False,\n",
    "    verbose=True,\n",
    ")"
   ],
   "id": "6f225ac52de049fe",
   "outputs": [],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:57:11.873826Z",
     "start_time": "2024-09-09T00:55:00.762603Z"
    }
   },
   "cell_type": "code",
   "source": "topics, _ = topic_model.fit_transform(texts)",
   "id": "a3c471fa1340f2b1",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-09-09 09:55:00,770 - BERTopic - Embedding - Transforming documents to embeddings.\n",
      "Batches: 100%|██████████| 144/144 [01:15<00:00,  1.91it/s]\n",
      "2024-09-09 09:56:16,186 - BERTopic - Embedding - Completed ✓\n",
      "2024-09-09 09:56:16,188 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
      "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n",
      "2024-09-09 09:56:31,763 - BERTopic - Dimensionality - Completed ✓\n",
      "2024-09-09 09:56:31,764 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
      "2024-09-09 09:56:31,973 - BERTopic - Cluster - Completed ✓\n",
      "2024-09-09 09:56:31,973 - BERTopic - Representation - Extracting topics from clusters using representation models.\n",
      "2024-09-09 09:56:58,538 - BERTopic - Representation - Completed ✓\n",
      "2024-09-09 09:56:58,563 - BERTopic - Topic reduction - Reducing number of topics\n",
      "2024-09-09 09:57:11,105 - BERTopic - Topic reduction - Reduced number of topics from 93 to 50\n"
     ]
    }
   ],
   "execution_count": 10
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:58:30.215692Z",
     "start_time": "2024-09-09T00:58:30.208647Z"
    }
   },
   "cell_type": "code",
   "source": "all_topics = topic_model.get_topics()",
   "id": "8d243c70ae36ff59",
   "outputs": [],
   "execution_count": 11
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T00:58:48.941751Z",
     "start_time": "2024-09-09T00:58:48.933672Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Flatten the keywords of every topic except topic -1 into a single list.\n",
    "concepts = [\n",
    "    word\n",
    "    for topic_num, keywords in all_topics.items()\n",
    "    if topic_num != -1\n",
    "    for word, _score in keywords\n",
    "]"
   ],
   "id": "1e5943eb194005c8",
   "outputs": [],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:00:18.277483Z",
     "start_time": "2024-09-09T01:00:18.269076Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Lowercase and remove duplicates. Sorting makes the order (and therefore the\n",
    "# ids later written to the concept file) reproducible; iterating a bare set of\n",
    "# strings can yield a different order in every kernel session.\n",
    "concepts = sorted({keyword.lower() for keyword in concepts})"
   ],
   "id": "f3eb13d59e9dfcd3",
   "outputs": [],
   "execution_count": 15
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:00:19.199669Z",
     "start_time": "2024-09-09T01:00:19.193676Z"
    }
   },
   "cell_type": "code",
   "source": "len(concepts)",
   "id": "1ed0b3ffd88ba32e",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "476"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 16
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:35:13.564916Z",
     "start_time": "2024-09-09T01:35:13.560333Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Persist the concepts as `id|concept` lines, with ids starting at 1.\n",
    "# An explicit encoding keeps the file independent of the platform default.\n",
    "with open(\"data/nlp/extracted_concepts.tsv\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for concept_id, concept in enumerate(concepts, 1):  # avoid shadowing builtin `id`\n",
    "        f.write(f\"{concept_id}|{concept}\\n\")"
   ],
   "id": "e8712f254b46b925",
   "outputs": [],
   "execution_count": 26
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:35:38.613584Z",
     "start_time": "2024-09-09T01:35:38.602736Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Read both concept lists back: column 1 of each pipe-delimited, header-less file.\n",
    "# dtype=str with keep_default_na=False stops pandas from parsing concepts such as\n",
    "# \"null\", \"nan\" or \"NA\" into float NaN, which would break the string handling later.\n",
    "extracted_concepts = pd.read_csv(\n",
    "    \"data/nlp/extracted_concepts.tsv\",\n",
    "    delimiter=\"|\",\n",
    "    header=None,\n",
    "    dtype=str,\n",
    "    keep_default_na=False,\n",
    ")\n",
    "extracted_concepts = extracted_concepts[1].tolist()\n",
    "\n",
    "gold_concepts = pd.read_csv(\n",
    "    \"data/nlp/raw/gold_concepts.tsv\",\n",
    "    delimiter=\"|\",\n",
    "    header=None,\n",
    "    dtype=str,\n",
    "    keep_default_na=False,\n",
    ")\n",
    "gold_concepts = gold_concepts[1].tolist()"
   ],
   "id": "3542e95763a9f4dc",
   "outputs": [],
   "execution_count": 28
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:35:49.327831Z",
     "start_time": "2024-09-09T01:35:49.325483Z"
    }
   },
   "cell_type": "code",
   "source": "len(extracted_concepts)",
   "id": "d8a2c0ff20265daf",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "476"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 30
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:38:25.021842Z",
     "start_time": "2024-09-09T01:38:25.018046Z"
    }
   },
   "cell_type": "code",
   "source": [
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "\n",
    "def singularize_concept(concept):\n",
    "    \"\"\"Lemmatize every whitespace-separated word of `concept` as a noun.\n",
    "\n",
    "    The lemmatized words are re-joined with single spaces.\n",
    "    \"\"\"\n",
    "    return ' '.join(\n",
    "        lemmatizer.lemmatize(token, wordnet.NOUN) for token in concept.split()\n",
    "    )"
   ],
   "id": "66c49b69b7589557",
   "outputs": [],
   "execution_count": 34
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:40:27.954359Z",
     "start_time": "2024-09-09T01:40:27.939436Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Normalise both lists: lowercase first, then singularize.\n",
    "# The WordNet lemmatizer is case-sensitive, so lemmatizing before lowercasing\n",
    "# leaves capitalised plurals (e.g. \"Networks\") unchanged.\n",
    "gold_concept = [singularize_concept(concept.lower()) for concept in gold_concepts]\n",
    "extracted_concept = [singularize_concept(concept.lower()) for concept in extracted_concepts]"
   ],
   "id": "ef916e85711040cf",
   "outputs": [],
   "execution_count": 38
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:41:43.131463Z",
     "start_time": "2024-09-09T01:41:43.116330Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Build one frame of unique concepts; `label` records the source (0 = extracted, 1 = gold).\n",
    "df_old = pd.DataFrame({\"concept\": extracted_concept}).assign(label=0)\n",
    "df_new = pd.DataFrame({\"concept\": gold_concept}).assign(label=1)\n",
    "\n",
    "# Sorting by label puts the extracted rows first, so a concept present in both\n",
    "# sources keeps label 0 when duplicates are dropped.\n",
    "df = (\n",
    "    pd.concat([df_old, df_new])\n",
    "    .sort_values(by=\"label\")\n",
    "    .drop_duplicates(subset=\"concept\", keep=\"first\")\n",
    ")"
   ],
   "id": "7d9a7292be184896",
   "outputs": [],
   "execution_count": 39
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:41:48.384543Z",
     "start_time": "2024-09-09T01:41:48.380354Z"
    }
   },
   "cell_type": "code",
   "source": "df.shape",
   "id": "373dd9cec439807c",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(945, 2)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:54:07.044880Z",
     "start_time": "2024-09-09T01:44:59.125995Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Reduce the text dataset to the abstracts that mention each concept.\n",
    "\n",
    "def filter_abstracts_by_term(term, abstracts, threshold=70):\n",
    "    \"\"\"Return the abstracts that fuzzily contain `term`.\n",
    "\n",
    "    Args:\n",
    "        term: Concept string to look for; compared case-insensitively.\n",
    "        abstracts: Iterable of abstracts; non-string entries are skipped.\n",
    "        threshold: Minimum `fuzz.partial_ratio` score required for a match.\n",
    "\n",
    "    Returns:\n",
    "        List of the matching abstracts, unmodified and in input order.\n",
    "    \"\"\"\n",
    "    needle = term.lower()  # hoisted: invariant across all abstracts\n",
    "    return [\n",
    "        abstract\n",
    "        for abstract in abstracts\n",
    "        if isinstance(abstract, str)\n",
    "        and fuzz.partial_ratio(needle, abstract.lower()) >= threshold\n",
    "    ]\n",
    "\n",
    "\n",
    "# Map every concept to its matching abstracts and its source label.\n",
    "concept_abstracts = {}\n",
    "for _, row in tqdm(df.iterrows(), desc=\"Processing concepts\", total=df.shape[0]):\n",
    "    concept = row[\"concept\"]\n",
    "    concept_abstracts[concept] = {\n",
    "        \"abstracts\": filter_abstracts_by_term(concept, texts),\n",
    "        # plain int so the mapping can be passed to json.dump\n",
    "        \"label\": int(row[\"label\"]),\n",
    "    }"
   ],
   "id": "493357d24acf2184",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing concepts: 100%|██████████| 945/945 [09:07<00:00,  1.72it/s]\n"
     ]
    }
   ],
   "execution_count": 44
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:58:46.641410Z",
     "start_time": "2024-09-09T01:58:46.634711Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Count the concepts whose label is 0.\n",
    "label_0_count = sum(details['label'] == 0 for details in concept_abstracts.values())\n",
    "print(f\"Number of concepts added through BERTopic: {label_0_count}\")"
   ],
   "id": "2d163708f80d455",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of concepts added through BERTopic: 459\n"
     ]
    }
   ],
   "execution_count": 45
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:58:48.642325Z",
     "start_time": "2024-09-09T01:58:48.637552Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Count the concepts for which no abstract passed the fuzzy-match threshold.\n",
    "concepts_without_abstracts = [\n",
    "    name for name, details in concept_abstracts.items() if not details['abstracts']\n",
    "]\n",
    "empty_abstracts_count = len(concepts_without_abstracts)\n",
    "print(f\"Number of concepts with empty filtered_abstracts: {empty_abstracts_count}\")"
   ],
   "id": "bc447db46bfc3916",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of concepts with empty filtered_abstracts: 67\n"
     ]
    }
   ],
   "execution_count": 46
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-09T01:58:58.627853Z",
     "start_time": "2024-09-09T01:58:58.513Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Write the concept -> {abstracts, label} mapping to disk as indented UTF-8 JSON.\n",
    "output_file_path = \"data/nlp/concept_abstracts.json\"\n",
    "serialized = json.dumps(concept_abstracts, ensure_ascii=False, indent=4)\n",
    "with open(output_file_path, 'w', encoding='utf-8') as json_file:\n",
    "    json_file.write(serialized)"
   ],
   "id": "924fa49660df1607",
   "outputs": [],
   "execution_count": 49
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "e336ca881b7ad088"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
     "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
